Using purrr for quick summary tables - St Koo

Adapted from this fantastic Learn to Purrr tutorial

# Load example data
data("mtcars")

head(mtcars)
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1

You can use purrr to loop over all the columns, and output the info into a dataframe.

In this example, I want to see variable type, max character length, and missingness.

library(tidyverse) # Includes purrr

# Build a per-column summary table: class, widest printed width, NA count.
# map_df() iterates over the columns of mtcars and row-binds one small
# data.frame per column; `.id` stores the column name being summarised.
summary_df <- mtcars %>%
  purrr::map_df(
    function(column) {
      data.frame(
        class = class(column),
        max_char = max(nchar(column, keepNA = FALSE)),
        missing = sum(is.na(column))
      )
    },
    .id = "col_name"
  )

summary_df
##    col_name   class max_char missing
## 1       mpg numeric        4       0
## 2       cyl numeric        1       0
## 3      disp numeric        5       0
## 4        hp numeric        3       0
## 5      drat numeric        4       0
## 6        wt numeric        5       0
## 7      qsec numeric        5       0
## 8        vs numeric        1       0
## 9        am numeric        1       0
## 10     gear numeric        1       0
## 11     carb numeric        1       0

And because it’s a dataframe, you can use a package like kableExtra to format it for reports.

library(kableExtra)

summary_df %>%
  kableExtra::kbl() %>%
  kableExtra::kable_styling()
col_name class max_char missing
mpg numeric 4 0
cyl numeric 1 0
disp numeric 5 0
hp numeric 3 0
drat numeric 4 0
wt numeric 5 0
qsec numeric 5 0
vs numeric 1 0
am numeric 1 0
gear numeric 1 0
carb numeric 1 0

Fitting several linear models by group with purrr - Anna Quaglieri

For the example I am going to use the flights dataset from the R package nycflights13. I am going to fit linear model that tries to explain the arr_time as a function of dep_time and arr_delay.

library(nycflights13)
library(purrr)

flights %>%
  dplyr::select(arr_time, dep_time, arr_delay, carrier) %>%
  head()
## # A tibble: 6 x 4
##   arr_time dep_time arr_delay carrier
##      <int>    <int>     <dbl> <chr>  
## 1      830      517        11 UA     
## 2      850      533        20 UA     
## 3      923      542        33 AA     
## 4     1004      544       -18 B6     
## 5      812      554       -25 DL     
## 6      740      554        12 UA

To fit the model to the whole dataset we would use the following code:

summary(lm(arr_time ~ dep_time + arr_delay, data = flights))
## 
## Call:
## lm(formula = arr_time ~ dep_time + arr_delay, data = flights)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2250.59   -68.14    50.97   169.26  2342.72 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 491.250510   2.045943   240.1   <2e-16 ***
## dep_time      0.757657   0.001446   524.1   <2e-16 ***
## arr_delay    -1.633355   0.015815  -103.3   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 392.8 on 327343 degrees of freedom
##   (9430 observations deleted due to missingness)
## Multiple R-squared:  0.4566, Adjusted R-squared:  0.4566 
## F-statistic: 1.375e+05 on 2 and 327343 DF,  p-value: < 2.2e-16

What if we wanted to fit separate models by carrier?

# Fit one linear model per carrier by nesting each carrier's rows into a
# list-column, mapping lm() over it, and storing coefficient CIs alongside.
models <- flights %>%
  dplyr::select(arr_time, dep_time, arr_delay, carrier) %>%
  # nest(-carrier) is deprecated since tidyr 1.0.0; name the list-column.
  tidyr::nest(data = -carrier) %>%
  # One lm() per carrier, fitted on that carrier's nested data
  dplyr::mutate(fit = purrr::map(data, ~ lm(arr_time ~ dep_time + arr_delay, data = .x))) %>%
  # confint() can be passed directly; no anonymous wrapper needed
  dplyr::mutate(results_fit = purrr::map(fit, confint))
models
## # A tibble: 16 x 4
##    carrier data                  fit    results_fit      
##    <chr>   <list>                <list> <list>           
##  1 UA      <tibble [58,665 × 3]> <lm>   <dbl[,2] [3 × 2]>
##  2 AA      <tibble [32,729 × 3]> <lm>   <dbl[,2] [3 × 2]>
##  3 B6      <tibble [54,635 × 3]> <lm>   <dbl[,2] [3 × 2]>
##  4 DL      <tibble [48,110 × 3]> <lm>   <dbl[,2] [3 × 2]>
##  5 EV      <tibble [54,173 × 3]> <lm>   <dbl[,2] [3 × 2]>
##  6 MQ      <tibble [26,397 × 3]> <lm>   <dbl[,2] [3 × 2]>
##  7 US      <tibble [20,536 × 3]> <lm>   <dbl[,2] [3 × 2]>
##  8 WN      <tibble [12,275 × 3]> <lm>   <dbl[,2] [3 × 2]>
##  9 VX      <tibble [5,162 × 3]>  <lm>   <dbl[,2] [3 × 2]>
## 10 FL      <tibble [3,260 × 3]>  <lm>   <dbl[,2] [3 × 2]>
## 11 AS      <tibble [714 × 3]>    <lm>   <dbl[,2] [3 × 2]>
## 12 9E      <tibble [18,460 × 3]> <lm>   <dbl[,2] [3 × 2]>
## 13 F9      <tibble [685 × 3]>    <lm>   <dbl[,2] [3 × 2]>
## 14 HA      <tibble [342 × 3]>    <lm>   <dbl[,2] [3 × 2]>
## 15 YV      <tibble [601 × 3]>    <lm>   <dbl[,2] [3 × 2]>
## 16 OO      <tibble [32 × 3]>     <lm>   <dbl[,2] [3 × 2]>
# Expand the confidence-interval matrices into rows (3 coefficient rows per
# carrier). `.drop` is deprecated and ignored since tidyr 1.0.0; the modern
# interface names the column(s) to unnest via `cols`.
expand_models <- models %>%
  tidyr::unnest(cols = results_fit)
expand_models
## # A tibble: 48 x 4
##    carrier data                  fit    results_fit[,1]    [,2]
##    <chr>   <list>                <list>           <dbl>   <dbl>
##  1 UA      <tibble [58,665 × 3]> <lm>           491.    511.   
##  2 UA      <tibble [58,665 × 3]> <lm>             0.757   0.771
##  3 UA      <tibble [58,665 × 3]> <lm>            -1.83   -1.66 
##  4 AA      <tibble [32,729 × 3]> <lm>           434.    455.   
##  5 AA      <tibble [32,729 × 3]> <lm>             0.822   0.838
##  6 AA      <tibble [32,729 × 3]> <lm>            -1.02   -0.852
##  7 B6      <tibble [54,635 × 3]> <lm>           843.    870.   
##  8 B6      <tibble [54,635 × 3]> <lm>             0.406   0.425
##  9 B6      <tibble [54,635 × 3]> <lm>            -2.66   -2.42 
## 10 DL      <tibble [48,110 × 3]> <lm>           418.    436.   
## # … with 38 more rows
expand_models$fit[1]
## [[1]]
## 
## Call:
## lm(formula = arr_time ~ dep_time + arr_delay, data = .)
## 
## Coefficients:
## (Intercept)     dep_time    arr_delay  
##     501.083        0.764       -1.746

Column-wise operations with mutate, across and case_when - Anna Quaglieri

I found the method below really useful to recode the levels of one or several columns!

library(dplyr)

# Toy data: four regions with a quality rating before and after some change.
test_data <- data.frame(
  area = rep(c("North", "Sud", "East", "West"), times = c(2, 3, 4, 1)),
  quality_before = c("High", "Low",
                     "High", "Low", "Medium",
                     "Medium", "Low", "High", "High",
                     "Low"),
  quality_after = c("High", "High",
                    "High", "Medium", "Medium",
                    "Low", "Low", "High", "High",
                    "Low")
)

# Recode both quality columns from labels to ordinal scores in one pass:
# across() applies the same case_when() mapping to each listed column.
test_data %>%
  mutate(across(
    .cols = c(quality_before, quality_after),
    ~ case_when(
      .x == "Low" ~ 0,
      .x == "Medium" ~ 1,
      .x == "High" ~ 2
    )
  ))
##     area quality_before quality_after
## 1  North              2             2
## 2  North              0             2
## 3    Sud              2             2
## 4    Sud              0             1
## 5    Sud              1             1
## 6   East              1             0
## 7   East              0             0
## 8   East              2             2
## 9   East              2             2
## 10  West              0             0

Strongly suggest to have a look at other functions and applications to perform column-wise operations https://cran.r-project.org/web/packages/dplyr/vignettes/colwise.html.

Useful R Tips by Song

Before we start

sessionInfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.6
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_AU.UTF-8/en_AU.UTF-8/en_AU.UTF-8/C/en_AU.UTF-8/en_AU.UTF-8
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] nycflights13_1.0.1 kableExtra_1.3.1   forcats_0.5.0      stringr_1.4.0     
##  [5] dplyr_1.0.0        purrr_0.3.4        readr_1.3.1        tidyr_1.1.0       
##  [9] tibble_3.0.4       ggplot2_3.3.2      tidyverse_1.3.0    knitr_1.30        
## [13] png_0.1-7          magick_2.4.0      
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.1.0  xfun_0.19         haven_2.3.1       lattice_0.20-41  
##  [5] colorspace_2.0-0  vctrs_0.3.4       generics_0.0.2    viridisLite_0.3.0
##  [9] htmltools_0.5.0   yaml_2.2.1        utf8_1.1.4        blob_1.2.1       
## [13] rlang_0.4.8       pillar_1.4.6      withr_2.3.0       glue_1.4.2       
## [17] DBI_1.1.0         dbplyr_1.4.4      modelr_0.1.8      readxl_1.3.1     
## [21] lifecycle_0.2.0   munsell_0.5.0     gtable_0.3.0      cellranger_1.1.0 
## [25] rvest_0.3.5       evaluate_0.14     fansi_0.4.1       highr_0.8        
## [29] broom_0.5.6       Rcpp_1.0.5        backports_1.2.0   scales_1.1.1     
## [33] webshot_0.5.2     jsonlite_1.7.1    fs_1.4.2          hms_0.5.3        
## [37] digest_0.6.27     stringi_1.5.3     cli_2.1.0         tools_4.0.2      
## [41] magrittr_1.5      crayon_1.3.4      pkgconfig_2.0.3   ellipsis_0.3.1   
## [45] xml2_1.3.2        reprex_0.3.0      lubridate_1.7.9   assertthat_0.2.1 
## [49] rmarkdown_2.3     httr_1.4.2        rstudioapi_0.13   R6_2.5.0         
## [53] nlme_3.1-148      compiler_4.0.2

Data Exploration

I code in tidyverse universe plus tidylog to output all message corresponding to changes made to vector, dataframe, tibble, etc. Please find tidylog package @ https://github.com/elbersb/tidylog.

  • An initial step is often to preview a dataset. An alternative to summary is describe in the Hmisc package. The benefit is that it counts the number of NAs as summary does, but it also shows a frequency table for factor/character columns.
library(tidyverse)

# colnames(iris)
# summary(iris)
# str(iris)

library(Hmisc)

iris$Species %>% describe
## . 
##        n  missing distinct 
##      150        0        3 
##                                            
## Value          setosa versicolor  virginica
## Frequency          50         50         50
## Proportion      0.333      0.333      0.333
  • tidylog saves me significant code verification and avoids many errors. It warns you when NA is generated due to a situation not considered.
library(tidylog, warn.conflicts = FALSE, quietly = FALSE)

new_dt <- iris %>% 
  filter(Sepal.Length >= 4.6) %>% 
  mutate(new_name = case_when(
    Species == "versicolor" ~ "V",
    Species == "setosa" ~ "S"))
## filter: removed 5 rows (3%), 145 rows remaining
## mutate: new variable 'new_name' (character) with 3 unique values and 34% NA
  • Compare with revised code: no new NA was generated after the mutation.
library(tidylog, warn.conflicts = FALSE, quietly = FALSE)

new_dt <- iris %>% 
  filter(Sepal.Length >= 4.6) %>% 
  mutate(new_name = case_when(
    Species == "versicolor" ~ "Versicolor",
    Species == "setosa" ~ "Setosa",
    TRUE ~ "Virginica"))
## filter: removed 5 rows (3%), 145 rows remaining
## mutate: new variable 'new_name' (character) with 3 unique values and 0% NA

Data Preparation

Use iris dataframe as an example.

  • relocate also support .after, and combine use with where.
dt <- head(iris,5)

# dt %>% select("Species", everything(.))
dt %>% relocate("Species", .before = "Sepal.Length")
##   Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1  setosa          5.1         3.5          1.4         0.2
## 2  setosa          4.9         3.0          1.4         0.2
## 3  setosa          4.7         3.2          1.3         0.2
## 4  setosa          4.6         3.1          1.5         0.2
## 5  setosa          5.0         3.6          1.4         0.2
# dt %>% relocate(where(is.numeric), .after = where(is.factor))
  • Similar concept can be applied to a vector through SOfun package. I found this useful when adjusting factor levels. Of course, fct_reorder and fct_relevel are useful in different situations.
library(devtools)
# install_github("mrdwab/SOfun", force=TRUE)
library(SOfun)
v <- letters[1:7]
v %>% moveMe(., "a last; b, e, g before d; c first; g after b")
## [1] "c" "b" "g" "e" "d" "f" "a"
  • Another high frequency task is to manage NAs. This is way to spot hidden NAs.
hidden_na_dt <- data.frame(
  "student" = rep(c("A", "B", "C"),2),
  "assignment" = rep(c("A1", "A2"),3),
  "mark" = c(NA, runif(n = 5, min = 45, max = 100))
) %>% 
  filter(!is.na(mark))

hidden_na_dt
##   student assignment     mark
## 1       B         A2 54.06168
## 2       C         A1 70.47459
## 3       A         A2 94.28867
## 4       B         A1 66.39432
## 5       C         A2 89.64198
  • Apply complete from the tidyr package to fill 0 in the missing mark from assignment 1 for student A. If there are more combinations, multiple columns can be nested in the complete call.
hidden_na_dt  %>% 
  complete(student, nesting(assignment), fill = list(mark = 0))
## # A tibble: 6 x 3
##   student assignment  mark
##   <chr>   <chr>      <dbl>
## 1 A       A1           0  
## 2 A       A2          94.3
## 3 B       A1          66.4
## 4 B       A2          54.1
## 5 C       A1          70.5
## 6 C       A2          89.6

Data Visualisation

I believe ggplot2 / plotly is relatively popular in practice. I also recommend highcharter to visualize time-series data and/or visNetwork / igraph / ggraph to visualize networks. My focus today is labeling inside a chart, so I will use ggplot2 to demonstrate.

  • Randomly picked a few countries by max number of population of that country to show potential difference when treating labeling.
plt_original <- population %>% 
  filter(country %in% c("India", "United States of America", "Viet Nam",
                        "Lao People's Democratic Republic")) %>% 
  ggplot(aes(x = year, y = population, group = country, color = country))+
  geom_line()

plt_original

Functions that I used to improve numeric formatting

The purpose of having customized functions is to improve readability and reduce cognitive load for digesting information provided by visualization.

  • The function I grabbed from stackoverflow and made two adaptations: (1) allow the function to accept negative inputs and (2) expand to recognize trillions.
si_num <- function(x) {
  # Format a single number with SI-style suffixes and one decimal place:
  # thousands -> "K", millions -> "M", billions -> "B", trillions -> "T".
  # Values with |x| < 1000 are returned unchanged (numeric); NA stays NA.
  #
  # Fixes over the original:
  #   * the trillions branch was dead code -- the `x >= 1e9` test was a plain
  #     `if`, not `else if`, so 1.5e12 came out as "1500.0B";
  #   * negative inputs below 1000 lost their sign (abs(x) was returned).
  if (is.na(x)) {
    return(NA)
  }

  # Remember the sign so it can be re-attached to the formatted magnitude.
  prefix <- if (x < 0) "-" else ""
  ax <- abs(x)

  # Thresholds checked largest-first so each magnitude is handled exactly
  # once. Names double as the printed suffix.
  units <- c(T = 1e12, B = 1e9, M = 1e6, K = 1e3)
  for (i in seq_along(units)) {
    if (ax >= units[[i]]) {
      # Truncate (not round) to one decimal, matching the digit-slicing
      # behaviour of the original; the tiny epsilon guards against
      # floating-point representation pushing an exact tenth below itself.
      scaled <- floor(ax / units[[i]] * 10 + 1e-9) / 10
      return(sprintf("%s%.1f%s", prefix, scaled, names(units)[i]))
    }
  }

  # Below 1000: return the value untouched, sign included.
  x
}

si_vec <- function(x) {
  # Element-wise wrapper around si_num(), e.g. for ggplot axis labels.
  sapply(x, si_num)
}
  • Modifications include: (1) change graph title and axis titles and format, (2) change a theme: minimalist design, (3) remove legend and add text labels to each line.
  • Of course, there are more things: change the colour palette defined for each country, graph size, …
library(hrbrthemes)
library(scales)
library(ggrepel)
library(cowplot)


# Build x-axis breaks every 4 years, aligned so the final break lands on
# the last year present in the data.
year_series <- unique(population$year)
remainder <- (max(year_series) - min(year_series)) %% 4
new_breaks <- seq(from = min(year_series) + remainder,
                  to = max(year_series), by = 4)

# Subset to the four countries being compared.
df <- population %>%
  filter(country %in% c("India", "United States of America", "Viet Nam",
                        "Lao People's Democratic Republic"))

# Last observation per country -- used later to anchor the line labels.
df_end <- df %>%
  group_by(country) %>%
  filter(year == max(year)) %>%
  ungroup()

# Rebuilt plot: minimalist theme, SI-formatted y axis, and direct line
# labels (via ggrepel at each country's final data point) instead of a
# legend, to cut the reader's eye travel.
plt_adjust <- df %>% 
  ggplot(aes(x = year, y = population, group = country, color = country))+
  geom_line()+
  geom_point()+
  geom_text_repel(
    data = df_end,
    aes(label = str_wrap(country,25)),
    nudge_x = 1,
    direction = "y",## nudge labels vertically only
    size = 3,
    hjust = 0, ### left aligned
    segment.size = 0.3, ### from here down: the leader line joining point and label
    min.segment.length = 0,
    segment.color = "grey60") + 
  theme_ipsum() +
  theme(legend.position = "none") +
  scale_y_continuous(labels = si_vec)+
  scale_x_continuous(breaks = new_breaks, limits = c(NA, 2020))+
  labs(x = "Year", y = "Population", title = "Population Growth between 1995 and 2013")


plt_original

plt_adjust

  • Or, put it into plotly, the default hover over message often does not satisfy users, more professional format is recommended to be used in hover over text.
library(plotly)

# Build a custom hover string per point. `text` is not a ggplot2 aesthetic;
# it is carried through so ggplotly(tooltip = "text") can display it.
plt_plotly <- df %>% 
  mutate(text = str_c("Country: ", country, "\n",
                      "Year: ", year, "\n",
                      "Population: ", si_vec(population))) %>% 
  ggplot(aes(x = year, y = population, group = country, color = country, text = text))+
  geom_line()+
  geom_point()+
  theme_ipsum() +
  theme(legend.position = "none") +
  scale_y_continuous(labels = si_vec)+
  scale_x_continuous(breaks = new_breaks)+
  labs(x = "Year", y = "Population", title = "Population Growth between 1995 and 2013")



# Convert to an interactive plotly chart, using the custom hover text.
ggplotly({plt_plotly}, tooltip = "text")

At the end

To be continued — I have coded many interactive plots in shinyapps, and some can be found at https://coffeeandplot.com/apps/. This is a relatively new website we created a couple of months ago. Get in touch if you have any suggestions. Please find me @ https://www.linkedin.com/in/ytsong/.

Render an RMarkdown report - Sehrish Kanwal

This section describes how to render an RMarkdown report within a simple R conda environment on a Command Line Interface (cluster or linux environment). This could be achieved in two possible ways:

Both work but the second way is the recommended one, which will be described below.

  1. Create an environment.yml file, that looks something like

    #name of the conda environment
    name: HowRYou
    
    #the paths that conda takes a look for packages. Avoid using anaconda channel as we have
    #experienced issues using it 
    channels:
        - conda-forge
        - bioconda
        - defaults
    
    #install following packages in the conda environment
    #change according to the packages you are using in your RMarkdown file.
    #The first three are required (they are R essentials). You can also change the versions to
    # meet the requirements of your analysis 
    dependencies:
        - r-base=3.4.1
        - pandoc=1.19
        - r-rmarkdown=1.6
        - r-here
  2. Create a conda environment (in this case HowRYou is the conda environment name specified in the environment.yml file. -p flag should point to your miniconda installation path. To find how to install conda, check this

    conda env create -p /path/to/miniconda/envs/HowRYou --file environment.yml
  3. Activate this conda environment

    conda activate HowRYou
  4. Run the RMarkdown file

    Rscript -e "rmarkdown::render('HowRYou.Rmd')"

    To pass arguments to the Rmd script (in this case two arguments - an input directory location and name of the input vcf file)

    Rscript -e "rmarkdown::render('HowRYou.Rmd', params = list(directory = './data', file = 'dummy.txt'))"
  5. An example of a rendered script used in the above step # 4

Advantages:

  • Reproducibility - ability to perform the analysis multiple times
  • Portability - being able to move code from one machine or platform to another
  • Flexibility - change easily in response to different user and system requirements

Patchwork - Shazia Ruybal

# install.packages(c("tidyverse", "patchwork, palmerpenguins", "devtools"))
# devtools::install_github("bahlolab/ggwehi")
library(tidyverse)
library(patchwork)
# library(ggwehi)
library(palmerpenguins)
plot_1 <- penguins %>%
  ggplot(aes(x = flipper_length_mm, y = body_mass_g, color = species)) +
    geom_point() +
   # scale_color_wehi() +
    labs(x = "Flipper length (mm)",
         y = "Body mass (g)",
         color = "Penguin species") +
    theme_minimal()
plot_2 <- penguins %>% 
  ggplot(aes(x = flipper_length_mm, fill = species)) +
    geom_histogram(alpha = 0.5, position = "identity") +
   # scale_fill_wehi() +
    labs(x = "Flipper length (mm)",
         y = "Frequency",
         fill = "Penguin species") +
    theme_minimal()
plot_1 + plot_2 + # plot them side-by-side
  plot_layout(guides = "collect") + # this "collects" the legends to the right of the plot
  plot_annotation(tag_levels = "a", tag_suffix = ")") # this adds panel labels

plot_1 / plot_2 + # plot them one over the other
  plot_layout(guides = "collect") + # this "collects" the legends to the right of the plot
  plot_annotation(tag_levels = "a", tag_suffix = ")") # this adds panel labels

How to Generate Word Clouds in R (The simplest way) - Zeinab Manzari

# install.packages(c("wordcloud", "RColorBrewer"))
library(wordcloud)    # wordcloud for generating word cloud images
library(RColorBrewer) # RColorBrewer for colour palettes

# Words to draw and their relative frequencies (parallel vectors).
words <- c("RLadies", "Rmarkdown", "tips", "tricks", "script", "rmd", "console", "packages", "share", "4thanniversary", "celebrate", 
           "RSoftware", "Australia", "Melbourne", "Girls", "Learn","Teach", "Data Structure", "Algorithm", "Visualisation", 
           "Code", "Data", "ggplot2", "Zoom", "Help", "Text", "RStudio", "programing", "questions", "answers", "Plot", "happy")

freqs <- c(980, 900, 498, 811, 800, 654, 489, 90, 254, 500, 600, 200, 488, 400, 140, 250, 357, 789, 147, 120, 590, 741, 100, 788, 812, 693, 410, 753, 95, 80, 594, 644)
set.seed(3)

# NOTE: the argument is `freq`, not `freqs` -- a misspelled name is silently
# swallowed by `...`, leaving the required `freq` argument missing.
wordcloud(words = words, freq = freqs, scale = c(4, .4), max.words = 1000,
          random.order = TRUE, random.color = TRUE,
          colors = brewer.pal(8, "Accent"))

Insert a time-stamp in RStudio - Adele Barughare

In a code chunk or in an R script, insert a timestamp with ts + shift + tab.

First write ts:

ts
## function (data = NA, start = 1, end = numeric(), frequency = 1, 
##     deltat = 1, ts.eps = getOption("ts.eps"), class = if (nseries > 
##         1) c("mts", "ts", "matrix") else "ts", names = if (!is.null(dimnames(data))) colnames(data) else paste("Series", 
##         seq(nseries))) 
## {
##     if (is.data.frame(data)) 
##         data <- data.matrix(data)
##     if (is.matrix(data)) {
##         nseries <- ncol(data)
##         ndata <- nrow(data)
##         dimnames(data) <- list(NULL, names)
##     }
##     else {
##         nseries <- 1
##         ndata <- length(data)
##     }
##     if (ndata == 0) 
##         stop("'ts' object must have one or more observations")
##     if (missing(frequency)) 
##         frequency <- 1/deltat
##     else if (missing(deltat)) 
##         deltat <- 1/frequency
##     if (frequency > 1 && 0 < (d <- abs(frequency - round(frequency))) && 
##         d < ts.eps) 
##         frequency <- round(frequency)
##     if (!missing(start)) 
##         start <- as.numeric(start)
##     if (length(start) > 1L) {
##         start <- start[1L] + (start[2L] - 1)/frequency
##     }
##     if (!missing(end)) 
##         end <- as.numeric(end)
##     if (length(end) > 1L) {
##         end <- end[1L] + (end[2L] - 1)/frequency
##     }
##     if (missing(end)) 
##         end <- start + (ndata - 1)/frequency
##     else if (missing(start)) 
##         start <- end - (ndata - 1)/frequency
##     if (start > end) 
##         stop("'start' cannot be after 'end'")
##     cycles <- as.numeric((end - start) * frequency)
##     if (abs(round(cycles) - cycles) > ts.eps * max(cycles, 1)) 
##         stop("'end' must be a whole number of cycles after 'start'")
##     nobs <- floor(cycles + 1.01)
##     if (nobs != ndata) 
##         data <- if (NCOL(data) == 1) {
##             if (ndata < nobs) 
##                 rep_len(data, nobs)
##             else if (ndata > nobs) 
##                 data[1L:nobs]
##         }
##         else {
##             if (ndata < nobs) 
##                 data[rep_len(1L:ndata, nobs), ]
##             else if (ndata > nobs) 
##                 data[1L:nobs, ]
##         }
##     attr(data, "tsp") <- c(start, end, frequency)
##     if (!is.null(class) && class[[1]] != "none") 
##         attr(data, "class") <- class
##     data
## }
## <bytecode: 0x7fa35aaa87c0>
## <environment: namespace:stats>

Then press shift + tab:

# Wed Nov 11 10:42:54 2020 ------------------------------

Making comparisons by groups? - Emi Tanaka

ggplot(ChickWeight, aes(Time, weight, group = Chick)) +
  geom_line(aes(color = Diet)) +
  facet_wrap(~Diet) +
  theme_bw() +
  labs(y = "Weight") +
  scale_color_viridis_d()

ggplot(ChickWeight, aes(Time, weight, group = Chick)) +
  geom_line(data = select(ChickWeight, -Diet), color = "gray") + 
  geom_line(aes(color = Diet)) +
  facet_wrap(~Diet) +
  theme_bw() +
  labs(y = "Weight") +
  scale_color_viridis_d()

R Tips for (Sport and Non-Sport) Scientists and/ or those who simply love using R! - Alice Sweeting

Below are some tips and tricks that I have picked up along my R journey, that may (or may not!) be useful for those working with sport science data, or any data, in R. I hope you find the tips useful!


Visualising Netball Data

If you are interested in working with some real life netball data, you can see a slidedeck that I put together of how to import, tidy and analyse the data here.

Working with Files from Dropbox

Sometimes, someone will share some data with us via Dropbox. Often these files can be really big and it is annoying to download a local copy of these files on your machine. If you have been sent a link to view these files, the following may be useful so you can load them directly into R, without having to save the file locally on your machine first.

  1. Get “Share (or Copy) Dropbox Link” from file. This might look like: https://www.dropbox.com/s/sometexthere/NameOfFile.csv?dl=0
  2. Change dl=0 to dl=1 at end of link. This might look like: https://www.dropbox.com/s/sometexthere/NameOfFile.csv?dl=1
  3. Run the following code
# Install the {vroom} package (the package name must be a quoted string;
# an unquoted `vroom` would be evaluated as an undefined object)
install.packages("vroom")
# Load the package from your library
library(vroom)
# Load your data directly into R!
data <- vroom("https://www.dropbox.com/s/sometexthere/NameOfFile.csv?dl=1")

Quickly importing Athlete Management System (AMS) Data into R

Use R to quickly import your AMS data without even opening a web browser! To do this, please follow the steps below. Note – I am using Smartabase as an example here because it is familiar to me.

  1. Save a report of your data within Smartabase. For example, save a report that contains all historical athlete wellness data.
  2. Open R and use the code below to import your data. Note, you will need to add your own username/ password/ URL details that are specific to your team.
# Load required packages
library(rvest)
library(plyr)
library(dplyr)
library(qdap)
# Connect to a report that you have generated via Smartabase.
# SECURITY NOTE(review): the username/password are embedded in the URL in
# plain text; prefer reading them from environment variables or .Renviron.
# NOTE(review): html_session() was renamed session() in rvest 1.0 -- confirm
# against your installed rvest version.
WellnessDataURL <- html_session("https://username:password@my2.smartabase.com/yourteamsname/live?report=WellnessData&updategroup=true")
# Read in data
WellnessData <- read_html(WellnessDataURL)
# Identify the table(s) in the returned HTML
WellnessDataTable <- html_nodes(WellnessData, "table")
# Collect only the first table's data
WellnessDataTable1 <- html_table(WellnessDataTable[1], fill = TRUE)
# Make data.frame
HistoricalWellnessData <- as.data.frame(WellnessDataTable1)
# Clean environment: drop everything except the final data.frame
rm(list = grep("^HistoricalWellnessData", ls(), value = TRUE, invert = TRUE))

Now your AMS data is in a neat data.frame and ready for any further statistical analysis or visualisation using R, without needing to open a web browser!


![

Making a cartogram of the 2020 Victorian COVID-19 outbreak - Dianne Cook

In Melbourne we have been in a strict lockdown since July. Each week we get our hopes up that restrictions might be eased, and once again these hopes are dashed by the announcement Sunday Oct 25, keeping the restrictions a little longer because of another outbreak in the northwest of the city. The data we have collected here are the case counts by Victorian local government area (LGA) since the beginning of July. We will examine the spatiotemporal distribution of these counts.

Working with spatial data is always painful! It almost always requires some ugly code. Part of the reason for the difficulty is the use of special data objects, that describe maps. There are several different choices, and some packages and tools use one, and others use another, so not all tools work together. The sf package is a recent endeavour that helps enormously, but some tools still use other forms, and when you run into errors this might be the reason - it can be hard to tell. Another reason is that map objects can be very large, which makes sense for accurate mapping, but for data analysis and visualisation, we’d rather have smaller, even if slightly inaccurate, spatial objects. It can be helpful to thin out map data before doing further analysis - you need special tools for this, eg mapshapr. We don’t need this for the exercises here. Another problem commonly encountered is that there are numerous coordinate systems, and types of projections of the 3D globe into a 2D canvas. We have become accustomed to lat/long but like time its an awkward scale to compute on because a translation from E/W and N/S to positive and negative values is needed. More commonly a Universal Transverse Mercator (UTM) is the standard but its far less intuitive to use.

The code for all the analysis is provided for you. We recommend that you run the code in steps to see what it is doing, why the mutating and text manipulations are necessary. Talk about the code with each other to help you understand it.

Getting the data

COVID-19 counts

COVID-19 data by LGA is available from https://www.covid19data.com.au/victoria. You should find that some variables are type chr because “null” has been used to code entries on some days. This needs fixing, and also missings should be converted to 0, with the rationale being that if the value is missing it most likely means there were no cases.

# Read the data
# Replace null with 0, for three LGAs
# Convert to long form to join with polygons
# Make the date variables a proper date
# Set NAs to 0, this is a reasonable assumption
# Read the data
# Replace "null" (a text missing-value code) with 0 for the three LGAs
# Convert to long form to join with polygons
# Make the date variable a proper date
# Set NAs to 0, this is a reasonable assumption
covid <- read_csv("https://raw.githubusercontent.com/numbats/eda/master/data/melb_lga_covid.csv") %>%
  # One across() replaces three copy-pasted mutate(ifelse(...)) calls
  mutate(across(c(Buloke, Hindmarsh, Towong),
                ~ as.numeric(ifelse(.x == "null", "0", .x)))) %>%
  pivot_longer(cols = Alpine:Yarriambiack, names_to = "NAME", values_to = "cases") %>%
  # Dates arrive as day-month strings; prepend the year before parsing
  mutate(Date = ydm(paste0("2020/", Date))) %>%
  mutate(cases = replace_na(cases, 0))

# Case counts are cumulative, so take lags to get daily case counts
# (the first day in each LGA becomes NA: nothing to difference against).
# NOTE: covid deliberately stays grouped by NAME -- the filter below
# evaluates max(Date) per LGA because of it.
covid <- covid %>%
  group_by(NAME) %>%
  mutate(new_cases = cases - dplyr::lag(cases))

# Filter to the final day, which carries the cumulative count
covid_cumul <- covid %>%
  filter(Date == max(Date))

Spatial polygons

Now let’s get polygon data of Victorian LGAs from the ozmaps package. We need to fix some names of LGAs because there are duplicated LGA names, and there is one mismatch in names from the COVID data and the ozmaps data (Colac Otway). If the COVID data had been provided with a unique LGA code it would have helped in merging with the polygon data.

# Read the LGA polygons from the ozmaps package (covers all of Australia).
# Repair the names so they join cleanly onto the COVID data:
#   * three interstate LGAs share a Victorian name -- tag them first so the
#     suffix-stripping regex cannot collapse them into the Victorian ones;
#   * strip the " (type) (state)" suffixes with a regex;
#   * align the one remaining spelling mismatch (Colac-Otway).
# Good reference: https://r-spatial.github.io/sf/articles/sf1.html
data("abs_lga")
vic_lga <- abs_lga %>%
  mutate(
    NAME = case_when(
      NAME == "Latrobe (M) (Tas.)" ~ "LatrobeM",
      NAME == "Kingston (DC) (SA)" ~ "KingstonSA",
      NAME == "Bayside (A)" ~ "BaysideA",
      TRUE ~ NAME
    ),
    NAME = str_replace(NAME, " \\(.+\\)", ""),
    NAME = if_else(NAME == "Colac-Otway", "Colac Otway", NAME)
  )
# Project to EPSG 3395 (equivalent to WGS84 Mercator); cartogram() needs a
# projected CRS. See https://spatialreference.org/ref/epsg/?page=28
vic_lga <- st_transform(vic_lga, 3395)

Choropleth map

A choropleth map is made from filling the colour of polygons.

# Join covid data to polygon data, remove LGAs with 
# missing values which should leave just Vic LGAs
vic_lga_covid <- vic_lga %>%
  left_join(covid_cumul, by="NAME") %>%
  filter(!is.na(cases))

# Make choropleth map, with an appropriate sequential colour palette.
# NOTE(review): `label` is not a geom_sf aesthetic -- it is carried along so
# the ggplotly() call (commented out below) can show the LGA name on hover.
choropleth <- ggplot(vic_lga_covid) + 
  geom_sf(aes(fill = cases, label=NAME), colour="white") + 
  scale_fill_distiller("Cases", palette = "YlOrRd",
                       direction=1) + 
  theme_map() +
  theme(legend.position="bottom")
choropleth

#ggplotly(choropleth) # Interactive map

Population-transformed cartogram

Get population data

The file VIF2019_Population_Service_Ages_LGA_2036.xlsx has been extracted from the Vic Gov web site. It is a complicated xlsx file, with the data in sheet 3, starting 13 rows down. The readxl package is handy for pulling population data out of files like this. The code chunk below contains only the data needed, already extracted.

# Population lookup table: one row per Victorian LGA, matching the
# NAME values used in the COVID/polygon data.
# NOTE(review): values appear to be person counts (some fractional,
# presumably interpolated projections) — confirm against sheet 3 of
# the VIF2019 spreadsheet described above.
pop <- tibble(NAME = c("Alpine","Ararat","Ballarat","Banyule","Bass Coast","Baw Baw","Bayside","Benalla","Boroondara","Brimbank","Buloke","Campaspe","Cardinia","Casey","Central Goldfields","Colac Otway","Corangamite","Darebin","East Gippsland","Frankston","Gannawarra","Glen Eira","Glenelg","Golden Plains","Greater Bendigo","Greater Dandenong","Greater Geelong","Greater Shepparton","Hepburn","Hindmarsh","Hobsons Bay","Horsham","Hume","Indigo","Kingston","Knox","Latrobe","Loddon","Macedon Ranges","Manningham","Mansfield","Maribyrnong","Maroondah","Melbourne","Melton","Mildura","Mitchell","Moira","Monash","Moonee Valley","Moorabool","Moreland","Mornington Peninsula","Mount Alexander","Moyne","Murrindindi","Nillumbik","Northern Grampians","Port Phillip","Pyrenees","Queenscliffe","South Gippsland","Southern Grampians","Stonnington","Strathbogie","Surf Coast","Swan Hill","Towong","Wangaratta","Warrnambool","Wellington","West Wimmera","Whitehorse","Whittlesea","Wodonga","Wyndham","Yarra","Yarra Ranges","Yarriambiack"), 
pop = c(12578,11746.43,103500.3,127447,33464.85,49296.21,102912,13981.3,177276,204190,6284,37596.09,97572.66,312789,13085.32,21362.81,16241,155126,45598.55,139502,10567.15,148583,19758.61,22016,112270.9,160222,239529.9,65071.32,15526.87,5787.223,93445.04,19884.51,207038,16165.73,158937.6,160353.5,74622.36,7559.041,47479.75,122570.7,8674.158,86942,114799.3,146097.1,141422.1,54658,41794.85,29486,192625,122871.1,32668.76,172289.5,161528,19093.7,16738.47,14052.73,64174,11570.29,108627.2,7315.398,2927.166,29120.95,16122.74,111003,10357.01,30465.01,20895.86,6045.765,28592,34243.1,43531.44,3932.907,169641.6,207058,40100,227008,92898.52,155227.4,6742.772))

# Attach the population figures to the COVID polygon data
vic_lga_covid <- vic_lga_covid %>%
  left_join(pop, by = "NAME")

# Compute cases per 10,000 residents.
# pmax() clamps negative values at zero element-wise, which is what
# the original group_by(NAME) + max(x, 0) + ungroup() achieved one
# row at a time — pmax() does the same vectorized, with no grouping
# needed (cases and pop are constant within each NAME anyway).
vic_lga_covid <- vic_lga_covid %>%
  mutate(cases_per10k = pmax(cases / pop * 10000, 0))

Make the cartogram

# Distort the polygons so each LGA's area is proportional to its
# population. The st_cast() to MULTIPOLYGON is necessary to get
# plotly to work.
vic_lga_covid_carto <- vic_lga_covid %>%
  cartogram_cont("pop") %>%
  st_cast("MULTIPOLYGON")

# Plot the cartogram. The fill variable is cases per 10,000
# residents, so the legend is titled accordingly — the original
# "Cases" label was misleading for a population-standardised rate.
# NOTE(review): "cartgram" looks like a typo for "cartogram"; the
# name is kept unchanged so any later references still resolve.
cartgram <- ggplot(vic_lga_covid_carto) + 
  geom_sf(aes(fill = cases_per10k, label = NAME), colour = "white") + 
  scale_fill_distiller("Cases per 10k", palette = "YlOrRd",
                       direction = 1) + 
  theme_map() +
  theme(legend.position = "bottom") 
cartgram 

#ggplotly(cartgram) # Interactive cartogram

Lastly, a hexagon tile map

The hexagon tile map makes tiled hexagons representing the LGAs. You can read more about it in the documentation for the sugarbag package at https://srkobakian.github.io/sugarbag/.

# Spatial coordinates need to be in long/lat
# (create_hexmap works in geographic coordinates, so undo the
# EPSG:3395 projection applied earlier)
vlc_latlong <- st_transform(vic_lga_covid, crs = "+proj=longlat +datum=WGS84")

# Placement of hexmaps depends on position relative to
# Melbourne central
# capital_cities comes with the sugarbag package and supplies the
# focal points used to anchor the hexagon allocation
data(capital_cities)
# Allocate one hexagon position per LGA (sf_id = "NAME");
# verbose = TRUE prints progress as hexagons are assigned
vic_lga_hexmap <- create_hexmap(
  shp = vlc_latlong,
  sf_id = "NAME",
  focal_points = capital_cities, verbose = TRUE)
# This shows the centroids of the hexagons
# ggplot(vic_lga_hexmap, aes(x=hex_long, y=hex_lat)) +
#  geom_point()

# Hexagons are made with the `fortify_hexagon` function, which
# expands each LGA's hexagon into its vertex coordinates (several
# rows per NAME). Case and population data are then joined on, and
# non-Victorian rows (no case count) dropped.
vic_lga_covid_hexmap <- vic_lga_hexmap %>%
  fortify_hexagon(sf_id = "NAME", hex_size = 0.1869) %>%
  left_join(covid_cumul, by = "NAME") %>%
  filter(!is.na(cases)) %>%
  left_join(pop, by = "NAME") %>%
  # pmax() clamps at zero element-wise; since cases and pop are
  # constant within each NAME, this gives the same result as the
  # original group_by(NAME) + max(x, 0) + ungroup(), vectorized
  mutate(cases_per10k = pmax(cases / pop * 10000, 0))

# Hexagon tile map: grey LGA boundaries underneath, one filled
# hexagon per LGA on top. Fill and colour encode the same variable
# (cases per 10,000 residents), so both scales carry the same title
# and merge into a single legend — the original "Cases" label was
# misleading for a population-standardised rate.
hexmap <- ggplot() +
  geom_sf(data = vlc_latlong, 
          fill = "grey90", colour = "white", size = 0.1) +
  geom_polygon(data = vic_lga_covid_hexmap, 
               aes(x = long, y = lat, group = hex_id, 
                   fill = cases_per10k, 
                   colour = cases_per10k, 
                   label = NAME), 
               size = 0.2) +
  scale_fill_distiller("Cases per 10k", palette = "YlOrRd",
                       direction = 1) +
  scale_colour_distiller("Cases per 10k", palette = "YlOrRd",
                         direction = 1) +
  theme_map() +
  theme(legend.position = "bottom", aspect.ratio = 0.7)
hexmap

# ggplotly(hexmap)
 

A work by R-Ladies Melbourne